In [1]:
# Allow us to load `open_cp` without installing
# (prepend the repository root to the module search path).
import sys, os.path
sys.path.insert(0, os.path.abspath(".."))

# Render matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import open_cp

In [2]:
import open_cp.sources.chicago as chicago

# Load the Chicago burglary data as a TimedPoints object
# (may fetch/read a data file on first use — TODO confirm caching).
points = chicago.default_burglary_data()
points


Out[2]:
<open_cp.data.TimedPoints at 0x23bc55c55f8>

In [3]:
len(points.timestamps), points.time_range


Out[3]:
(59209,
 (numpy.datetime64('2016-03-17T05:06:00.000'),
  numpy.datetime64('2017-03-16T23:30:00.000')))

In [4]:
# Report the spatial extent of the data.
bbox = points.bounding_box
for label, low, high in (("X", bbox.xmin, bbox.xmax), ("Y", bbox.ymin, bbox.ymax)):
    print(label + " coord range:", low, high)
print(bbox.aspect_ratio)


X coord range: 334018.117311 367319.649846
Y coord range: 552879.750308 594819.314566
1.25938841447

In [5]:
# Scatter every event; strong alpha blending so dense areas stand out.
fig, ax = plt.subplots(figsize=(10, 10 * bbox.aspect_ratio))
xcoords, ycoords = points.coords
ax.scatter(xcoords, ycoords, alpha=0.1, marker="o", s=1)


Out[5]:
<matplotlib.collections.PathCollection at 0x23bc753c5f8>

As an American city, most streets run North-South or East-West. Further, the data is geocoded to the centre of the "block", to anonymise the data. (Though this is slightly inconsistent, if one looks closely at the raw CSV file.)

In the plot above:

  • The clump at the upper left is the airport.
  • We see a large clump of burglary data downtown.
  • It would be interesting to know what causes the visible lines running north-north-west from downtown.

In [6]:
# Restrict to a rectangular window around downtown.
in_x_range = (points.xcoords >= 355000) & (points.xcoords <= 365000)
in_y_range = (points.ycoords >= 575000) & (points.ycoords <= 585000)

downtown = points[in_x_range & in_y_range]

# Report the extent of the subset actually selected.
bbox = downtown.bounding_box
for label, low, high in (("X", bbox.xmin, bbox.xmax), ("Y", bbox.ymin, bbox.ymax)):
    print(label + " coord range:", low, high)


X coord range: 355000.54864 360647.882859
Y coord range: 575007.0104 584997.43968

In [7]:
# Plot only the downtown subset.
fig, ax = plt.subplots(figsize=(5, 5 * bbox.aspect_ratio))
xcoords, ycoords = downtown.coords
ax.scatter(xcoords, ycoords, alpha=0.1, marker="o", s=1)


Out[7]:
<matplotlib.collections.PathCollection at 0x23bc7570390>

UK Crime data

We use an example of January 2017 from West Yorkshire.


In [8]:
# Load the UK police burglary data (West Yorkshire, January 2017).
from open_cp.sources import ukpolice

points = ukpolice.default_burglary_data()
len(points.timestamps)


Out[8]:
2358

In [9]:
bbox = points.bounding_box

# Raw longitude/latitude scatter; distances here are distorted (see below).
fig, ax = plt.subplots(figsize=(10, 10 * bbox.aspect_ratio))
lons, lats = points.xcoords, points.ycoords
ax.scatter(lons, lats, s=10, alpha=0.2)


Out[9]:
<matplotlib.collections.PathCollection at 0x23bc75d49e8>

These are longitude / latitude points, which distort distance. Assuming you have pyproj installed, you can project them. For the UK, we use the British National Grid.


In [10]:
import open_cp

In [11]:
projected_points = open_cp.data.points_from_lon_lat(points, epsg=7405)

In [12]:
bbox = projected_points.bounding_box

# Same scatter as above, now in projected coordinates.
fig, ax = plt.subplots(figsize=(10, 10 * bbox.aspect_ratio))
eastings, northings = projected_points.xcoords, projected_points.ycoords
ax.scatter(eastings, northings, s=10, alpha=0.2)


Out[12]:
<matplotlib.collections.PathCollection at 0x23bc7973860>

Random data


In [13]:
import open_cp.sources.random as random
import datetime

In [14]:
# A rectangular region in projected coordinates, roughly matching the
# West Yorkshire extent plotted above.
# NOTE(review): argument order appears to be (xmin, xmax, ymin, ymax) —
# confirm against open_cp.RectangularRegion's signature.
region = open_cp.RectangularRegion(390000, 450000, 410000, 450000)
# 1000 events, uniform in space over `region` and in time over Jan-Feb 2017.
points = random.random_uniform(region, datetime.date(2017,1,1), datetime.date(2017,3,1), 1000)

points.time_range


Out[14]:
(numpy.datetime64('2017-01-01T01:02:23.000'),
 numpy.datetime64('2017-02-28T23:16:49.000'))

In [15]:
bbox = points.bounding_box

# Scatter of the uniformly random events.
fig, ax = plt.subplots(figsize=(10, 10 * bbox.aspect_ratio))
xs, ys = points.coords
ax.scatter(xs, ys, s=10, alpha=0.2)


Out[15]:
<matplotlib.collections.PathCollection at 0x23bc7b29748>

If we have scipy installed, we can quickly use a 2D Gaussian kernel density estimation to get an estimate of the "risk intensity" from the real West Yorkshire data.


In [16]:
import scipy.stats

In [17]:
kernel = scipy.stats.gaussian_kde(projected_points.coords)

In [18]:
# Evaluate the KDE on a 100x100 grid covering the bounding box.
# NOTE(review): `bbox` here comes from the random-data cell above, not from
# projected_points — the two regions appear to coincide, but confirm.
X, Y = np.mgrid[bbox.xmin:bbox.xmax:100j, bbox.ymin:bbox.ymax:100j]
grid_points = np.vstack((X.ravel(), Y.ravel()))
Z = kernel(grid_points).reshape(X.shape)
np.max(Z)


Out[18]:
3.2111553812339177e-09

In [19]:
plt.imshow(np.rot90(Z))


Out[19]:
<matplotlib.image.AxesImage at 0x23bc9bdfbe0>

In [20]:
# Rejection-style sampler over `region` using the fitted KDE.
# NOTE(review): 4e-9 is presumably an upper bound on the kernel values
# (max observed above was ~3.2e-9) — confirm KernelSampler's contract.
sampler = random.KernelSampler(region, kernel, 4e-9)
# 2350 points over Jan-Feb 2017, roughly matching the real data set size.
points = random.random_spatial(sampler, datetime.date(2017,1,1), datetime.date(2017,3,1), 2350)

In [21]:
# Side-by-side comparison: real data against the fixed-bandwidth KDE sample.
fig, axes = plt.subplots(ncols=2, figsize=(16, 6))
panels = ((projected_points, "Real data, Jan 2017"), (points, "Gaussian KDE sample"))
for axis, (data, title) in zip(axes, panels):
    axis.scatter(*data.coords, s=10, alpha=0.2)
    axis.set_aspect(bbox.aspect_ratio)
    axis.set(xlim=[bbox.xmin, bbox.xmax], ylim=[bbox.ymin, bbox.ymax])
    axis.set_title(title)


The real plot still looks somewhat different to the random test data, suggesting that a simple fixed bandwidth Gaussian KDE is not appropriate (which we already knew...)

Using a nearest neighbour variable bandwidth Gaussian KDE


In [22]:
import open_cp.kernels

In [23]:
def _knn_kde_sample(k):
    """Sample 2350 points (Jan-Feb 2017) from a k-th nearest neighbour
    variable-bandwidth Gaussian KDE fitted to the projected UK data.

    :param k: number of nearest neighbours controlling the bandwidth.
    :return: the sampled points, as returned by `random.random_spatial`.
    """
    kernel = open_cp.kernels.kth_nearest_neighbour_gaussian_kde(projected_points.coords, k=k)
    # 4e-9 bounds the kernel values, as in the fixed-bandwidth cell above.
    sampler = random.KernelSampler(region, kernel, 4e-9)
    return random.random_spatial(sampler, datetime.date(2017,1,1), datetime.date(2017,3,1), 2350)

# One sample per bandwidth choice (replaces three copy-pasted blocks).
points10 = _knn_kde_sample(k=10)
points25 = _knn_kde_sample(k=25)
points50 = _knn_kde_sample(k=50)

In [24]:
# Real data alongside samples for the three bandwidth choices.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 9))
panels = [
    (projected_points, "Real data, Jan 2017"),
    (points10, "k=10 nearest neighbour sample"),
    (points25, "k=25 nearest neighbour sample"),
    (points50, "k=50 nearest neighbour sample"),
]
for axis, (data, title) in zip(axes.ravel(), panels):
    axis.scatter(*data.coords, s=10, alpha=0.2)
    axis.set_aspect(bbox.aspect_ratio)
    axis.set(xlim=[bbox.xmin, bbox.xmax], ylim=[bbox.ymin, bbox.ymax])
    axis.set_title(title)
fig.tight_layout()
None


Visually, having a rather narrow bandwidth seems to look better.

I suspect that to produce more realistic simulations, the geography of the data needs to be investigated: i.e. locating the points onto buildings and into the real street network.

Self-exciting point process sampler

Inhomogeneous Poisson process


In [25]:
import open_cp.sources.sepp as sepp

In [26]:
# A 100x100 region for the synthetic point process.
region = open_cp.RectangularRegion(0,100,0,100)
# NOTE(review): arguments presumably (time rate, space means, space
# variances, correlation) — confirm against sepp.PoissonTimeGaussianSpace.
kernel = sepp.PoissonTimeGaussianSpace(1, [50, 50], [150, 25], 0.8)

# Inhomogeneous Poisson process sampler driven by this intensity kernel.
sampler = sepp.InhomogeneousPoisson(region, kernel)

In [27]:
points = sampler.sample(0, 100)

In [28]:
# Left: spatial scatter; right: x coordinate against event time.
fig, (space_ax, time_ax) = plt.subplots(ncols=2, figsize=(16, 6))
times, xcoords, ycoords = points[0], points[1], points[2]
space_ax.scatter(xcoords, ycoords)
space_ax.set_title("Space location")
space_ax.set_aspect(1)
space_ax.set(xlim=(0, 100), ylim=(0, 100))
time_ax.scatter(times, xcoords)
time_ax.set(xlabel="time", ylabel="x coord", title="X location against time")
None


The coordinates in space give samples from a 2D correlated Gaussian distribution, as we expect.

If we do this repeatedly, then the time coordinates alone should give a Poisson process.


In [29]:
# Repeated realisations: per the markdown above, the per-run event counts
# should be Poisson distributed, as should counts in the window [0, 20].
time_samples = [sampler.sample(0, 100)[0] for _ in range(10000)]
counts = [len(times) for times in time_samples]
window = [np.sum(times <= 20) for times in time_samples]

fig, ax = plt.subplots(ncols=2, figsize=(16, 4))
ax[0].hist(counts)
ax[0].set_title("Number of points")
ax[1].hist(window)
ax[1].set_title("In window [0,20]")
None


Inhomogeneous Poisson process via factorisation

If the intensity function of the Poisson process has the form $\lambda(t,x,y) = \nu(t)\mu(x,y)$ then we can simulate the time-only Poisson process with density $\nu$, and then sample the space dimension as if it were a "mark" (see the notion of a "marked Poisson process" in the literature). If $\mu$ is a probability density of a standard type, this is much faster, because we can very easily draw samples for the space dimensions.


In [30]:
# Factorised intensity: exponential time kernel (total_rate=10 — presumably
# the expected number of events; confirm) with space coordinates drawn from
# a correlated Gaussian, per the markdown above.
time_kernel = sepp.Exponential(exp_rate=1, total_rate=10)
space_sampler = sepp.GaussianSpaceSampler([50, 50], [150, 25], 0.8)
sampler = sepp.InhomogeneousPoissonFactors(time_kernel, space_sampler)

In [31]:
points = sampler.sample(0, 100)

# Same two-panel view as before: space scatter and x-vs-time.
fig, (space_ax, time_ax) = plt.subplots(ncols=2, figsize=(16, 6))
times, xcoords, ycoords = points[0], points[1], points[2]
space_ax.scatter(xcoords, ycoords)
space_ax.set_title("Space location")
space_ax.set_aspect(1)
space_ax.set(xlim=(0, 100), ylim=(0, 100))
time_ax.scatter(times, xcoords)
time_ax.set(xlabel="time", ylabel="x coord", title="X location against time")
None


Self-exciting point process sampler

You need to pass two intensity functions (aka kernels), one for the background events, and one for the triggered events.

In the following example, the background sampler has as time component a constant rate poisson process, and a Gaussian space density, centred at (50,50).

The trigger kernel has an exponential density in time (so on average each event triggers one further event) and a space kernel which is deliberately biased to jump around 5 units in the x direction. We can hence visualise the cascade of triggered events as a rightward drift on the first graph, and an upward drift on the second graph.


In [32]:
# Background process: constant-rate Poisson in time (rate 1) with a
# Gaussian space density centred at (50, 50), per the markdown above.
background_sampler = sepp.InhomogeneousPoissonFactors(sepp.HomogeneousPoisson(1),
    sepp.GaussianSpaceSampler([50,50], [50,50], 0))

# Trigger process: exponential in time with total_rate=1 (on average each
# event triggers one further event) and a space kernel biased to jump
# about 5 units in the x direction.
time_kernel = sepp.Exponential(exp_rate=1, total_rate=1)
space_sampler = sepp.GaussianSpaceSampler([5, 0], [1, 1], 0)
trigger_sampler = sepp.InhomogeneousPoissonFactors(time_kernel, space_sampler)

# Combine background and trigger kernels into the self-exciting sampler.
sampler = sepp.SelfExcitingPointProcess(background_sampler, trigger_sampler)

In [33]:
points = sampler.sample(0,10)

In [34]:
# Same two-panel view: the cascade of triggered events shows up as a
# rightward drift (left panel) and an upward drift (right panel).
fig, (space_ax, time_ax) = plt.subplots(ncols=2, figsize=(16, 6))
times, xcoords, ycoords = points[0], points[1], points[2]
space_ax.scatter(xcoords, ycoords)
space_ax.set_title("Space location")
space_ax.set_aspect(1)
space_ax.set(xlim=(0, 100), ylim=(0, 100))
time_ax.scatter(times, xcoords)
time_ax.set(xlabel="time", ylabel="x coord", title="X location against time")
None



In [ ]: